# Beatriz Crispim (m20201835)
# Inês Resende (m20200844)
# João Tiago Homem (m20201791)
# Tiago Rodrigues (m20201771)
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels as sm
import os
import shutil
from zipfile import ZipFile
import warnings
warnings.filterwarnings('ignore')
from pylab import cm
import matplotlib.font_manager
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
from scipy import stats
import matplotlib.patches as mpatches
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import math
# Global matplotlib defaults used by the figures in this script; individual
# cells override 'font.size' and 'figure.dpi' as needed.
plt.rcParams.update({
    'font.size': 12,
    'mathtext.fontset': 'custom',
    'mathtext.rm': 'Bitstream Vera Sans',
    'mathtext.it': 'Bitstream Vera Sans:italic',
    'mathtext.bf': 'Bitstream Vera Sans:bold',
    "text.usetex": False,
    'axes.linewidth': 1,
    'xtick.direction': 'inout',
    'ytick.direction': 'inout',
    'xtick.major.size': 5.0,
    'xtick.minor.size': 3.0,
    'ytick.major.size': 5.0,
    'ytick.minor.size': 3.0,
    'figure.dpi': 100,
    'lines.markersize': 5,
    'lines.linewidth': 0.7,
    'lines.linestyle': '--',
    'lines.marker': '.',
    'figure.figsize': (8, 6),
})
# Auxiliary function
def stats(lista):
    """Summarise a numeric series as formatted strings.

    Parameters
    ----------
    lista : object exposing ``.mean()`` and accepted by ``np.percentile``
        and ``scipy.stats.skew`` (the script passes pandas Series).

    Returns
    -------
    tuple of str
        ``(mean, Q1, Q3, skewness)``, each formatted with two decimals.
    """
    summary = (
        lista.mean(),
        np.percentile(lista, 25),
        np.percentile(lista, 75),
        sp.stats.skew(lista),
    )
    return tuple(f'{value:.2f}' for value in summary)
# Auxiliary function
def statss(lista):
    """Median-based variant of the summary helper.

    Parameters
    ----------
    lista : object exposing ``.median()`` and accepted by ``np.percentile``
        and ``scipy.stats.skew``.

    Returns
    -------
    tuple of str
        ``(median, Q1, Q3, skewness)``, each formatted with two decimals.
    """
    summary = (
        lista.median(),
        np.percentile(lista, 25),
        np.percentile(lista, 75),
        sp.stats.skew(lista),
    )
    return tuple(f'{value:.2f}' for value in summary)
def plots1(var):
    """Draw box plot + histogram of ``var`` for each Attrition group.

    Builds a 2x2 figure: the left column shows employees with
    ``Attrition == 'Yes'``, the right column ``Attrition == 'No'``; the top
    row holds box plots and the bottom row 30-bin histograms. Mean, skewness,
    Q1 and Q3 (from ``stats``) are written under each histogram.

    Parameters
    ----------
    var : str
        Name of a numeric column of the module-level DataFrame ``df``.

    Notes
    -----
    The x-limits are set explicitly on every axes. The previous version used
    ``plt.xlim`` / ``plt.xlabel``, which act on the *current* axes (the last
    one created, bottom-right), so the 'Yes' column never received the shared
    range.
    """
    column = df[var]
    x_limits = (column.min(), column.max())

    fig, axes = plt.subplots(2, 2, figsize=(5, 6),
                             gridspec_kw={'height_ratios': [1, 4]})

    # (attrition value, histogram colour, box-plot colour), one per column.
    panels = (
        ('Yes', 'coral', 'orange'),
        ('No', 'dodgerblue', 'skyblue'),
    )
    for col, (label, hist_color, box_color) in enumerate(panels):
        subset = df[df['Attrition'] == label]
        box_ax, hist_ax = axes[0, col], axes[1, col]

        sns.histplot(ax=hist_ax, data=subset, x=var, color=hist_color, bins=30)
        sns.boxplot(ax=box_ax, data=subset, x=var, color=box_color)
        box_ax.set(xlabel=None)
        box_ax.set(title='Attrition = ' + label)

        res = stats(subset[var])
        captions = (
            (0, -0.15, 'Mean = ' + str(res[0])),
            (0.5, -0.15, 'Skewness = ' + str(res[3])),
            (0, -0.21, 'Q1 = ' + str(res[1])),
            (0.5, -0.21, 'Q3 = ' + str(res[2])),
        )
        for x_pos, y_pos, caption in captions:
            hist_ax.text(x_pos, y_pos, caption, transform=hist_ax.transAxes,
                         fontsize=8, color='k')

        hist_ax.set_xlabel(var)
        # Same range on both rows and both columns so the groups are comparable.
        for target in (box_ax, hist_ax):
            target.set_xlim(*x_limits)

    axes[1, 1].set_ylabel('')
    plt.show()
# The goals are:
# Load the HR dataset, drop duplicated rows and index it by employee number.
# NOTE(review): the path is machine-specific; adjust it before running elsewhere.
df = pd.read_csv(r'C:\Users\Tiago\Desktop\Human resources\HR_DS.csv')
print('Rows number before' ,df.shape[0])
df.drop_duplicates(inplace = True)
print('Rows number after removing duplicates' ,df.shape[0])
df.set_index('EmployeeNumber', inplace = True)

# Quick overview (the bare expressions only display output in a notebook).
df.head()
df.info()
df.describe(include=[np.number]).T
df.describe(include=[object]).T

# Skewness / kurtosis are only defined for numeric columns; numeric_only=True
# keeps this working on pandas versions that no longer drop object columns
# silently.
sk = df.skew(numeric_only=True)
sk
kt = df.kurt(numeric_only=True)
kt

#columns with kurt=0
df=df.drop(columns=['EmployeeCount', 'StandardHours'])

# Numeric copy of the target: 1 for 'Yes', 0 otherwise.
df['Attrition_aux'] = np.where((df['Attrition']=='Yes'),1,0)
df.head()
# Spearman correlation heatmap of all numeric columns.
# df still contains string columns (e.g. 'Attrition'); restrict to numeric
# dtypes explicitly so corr() does not fail on pandas versions that no longer
# drop non-numeric columns automatically.
numeric_df = df.select_dtypes(include=[np.number])
df_corr = numeric_df.corr(method = 'spearman')
figure = plt.figure(figsize=(16,10))
g=sns.heatmap(df_corr, annot=True, fmt = '.1g')

plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 80

# Zoom on the three tenure-related variables.
df_aux = df[['YearsAtCompany', 'YearsInCurrentRole', 'YearsWithCurrManager']].copy()
df_corr = df_aux.corr(method = 'spearman')
figure = plt.figure(figsize=(5,5))
g=sns.heatmap(df_corr, annot=True, fmt = '.1g')
# Discretise Age and HourlyRate into fixed-width groups.
# pd.cut builds right-closed intervals, so without include_lowest=True a value
# equal to the first edge (Age == 18, HourlyRate == 30) would fall outside
# every bin and become NaN.
df['age_bins'] = pd.cut(x=df['Age'], bins=[18, 25, 30, 35, 40, 45, 50, 55, 60],
                        include_lowest=True)
df['HourlyRate_bins'] = pd.cut(x=df['HourlyRate'], bins=[30, 40, 50, 60, 70, 80, 90, 100],
                               include_lowest=True)
# Bar chart of the number of employees per Attrition value, annotated with
# the share of the total and the absolute count.
plt.rcParams['font.size'] = 11
plt.rcParams['figure.dpi'] = 80

# Compute the frequency table once and derive everything from it.
attrition_counts = df['Attrition'].value_counts()
Total = attrition_counts.sum()
values = attrition_counts.keys().tolist()
counts = attrition_counts.tolist()

sns.barplot(x=values,y=counts, palette="deep")
ax = plt.gca()
for position, value in enumerate(counts):
    label = '{:.1f}%'.format(value / Total * 100) + ' (' + str(value) + ')'
    ax.text(position, value + 10, label, color='black', ha='center', size='large')

plt.grid(axis='y',color='grey', linestyle='-.', linewidth=0.5)
plt.xlabel('Attrition')
plt.ylabel('Employees count')  # was misspelled 'Exmployees count'
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.show()
# Count of employees per gender, one facet per Attrition value. Each bar is
# labelled with its count (above) and its share within the facet (inside).
var='Gender'
plt.rcParams['font.size'] = 8
plt.rcParams['figure.dpi'] = 110
g = sns.catplot(x=var, col="Attrition", col_wrap=4,
                data=df,
                kind="count", height=2.5, aspect=.8, palette='inferno')

# Horizontal offset and number format of the in-bar percentage per facet
# (kept from the original hand-tuned layout).
label_style = {
    'Yes': (0.2, "{:.3}"),
    'No': (0.1, "{:.4}"),
}

# axes_dict maps each Attrition value to its axes, so the denominator always
# matches the facet being annotated; the facet order follows the data and must
# not be assumed to be ('Yes', 'No').
for attrition_value, ax in g.axes_dict.items():
    subset_df = df[df['Attrition'] == attrition_value]
    soma = subset_df[var].value_counts().sum()
    x_offset, pct_format = label_style.get(attrition_value, (0.1, "{:.4}"))
    for p in ax.patches:
        height = p.get_height()
        # Absolute count just above the bar.
        ax.text(p.get_x()+0.25,
                height * 1.02,
                height,
                color='black',
                rotation='horizontal',
                size='medium')
        # Share of the facet total, drawn inside the bar.
        ax.text(p.get_x()+x_offset,
                height /2.5,
                pct_format.format((height/soma)*100) + " %",
                color='w',
                rotation='horizontal',
                size='medium')
# Percentage of employees in attrition within each Gender category.
variaveis = ['Gender']
Total = df['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(6, 5), dpi=100)
for i, var in enumerate(variaveis):
    subset_df = df[df['Attrition'] == 'Yes']
    leavers = subset_df[var].value_counts()
    subset_values = leavers.keys().tolist()
    # Head-count of every category over the whole data set.
    total_counts = [(df[var].values == level).sum() for level in subset_values]
    subset_counts = [round(n_yes / n_all * 100, 1)
                     for n_yes, n_all in zip(leavers.tolist(), total_counts)]

    ax = sns.barplot(x=subset_values, y=subset_counts, color='steelblue',
                     data=subset_df, edgecolor='k')
    # Write each bar's value just above it.
    for bar in ax.patches:
        bar_top = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2.0, bar_top + 0.05,
                 str(bar_top.round(2)), ha='center', va='bottom', fontsize='13')
    rects = ax.patches
    plt.ylabel('Percentage of employees in attrition ')
    plt.title(var)
plt.tight_layout()
plt.show()
# Histograms of Age and DistanceFromHome, one figure per Attrition value,
# with mean / skewness / quartiles (from stats) written under each panel.
variaveis = ['Age','DistanceFromHome']

# (Attrition value, bar colour, figure dpi, grid line width) for each figure.
figure_specs = (
    ('No', 'darkgreen', 100, 0.5),
    ('Yes', 'darkred', 300, 0.7),
)
for attrition, colour, dpi, grid_width in figure_specs:
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6), dpi=dpi)
    for i, var in enumerate(variaveis):
        sample = df[df['Attrition'] == attrition][var]
        sns.histplot(sample, ax=axes[i], bins=20, color=colour).set_title(
            str(var) + ' (Attrition= ' + attrition + ')')
        res = stats(sample)
        captions = (
            (-0.15, 'Mean = ' + str(res[0])),
            (-0.2, 'Skewness = ' + str(res[3])),
            (-0.25, 'Q1 = ' + str(res[1])),
            (-0.3, 'Q3 = ' + str(res[2])),
        )
        # Anchor every caption to its own axes; the previous version placed
        # them relative to axes[0] with a hand-tuned i*1.25 offset, which only
        # lines up for one particular layout.
        for y_pos, caption in captions:
            axes[i].text(0.1, y_pos, caption, transform=axes[i].transAxes,
                         fontsize=12, color='k')
        axes[i].grid(color='grey', linestyle='-.', linewidth=grid_width)
    # Only the first panel keeps its axis labels.
    for other_ax in axes[1:]:
        other_ax.set_ylabel('')
        other_ax.set_xlabel('')
    plt.tight_layout()
    plt.show()
# Box plot + histogram of Age per Attrition group, with a smaller font.
plt.rcParams.update({'font.size': 8, 'figure.dpi': 100})
plots1('Age')
# Distribution (in %) of EducationField and MaritalStatus inside each
# Attrition group, drawn as side-by-side grouped bars.
plt.rcParams['font.size'] = 10
variaveis = ['EducationField','MaritalStatus']
Total = df['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots(nrows=1, ncols=variaveis_len, figsize=(12, 3), dpi=100)
for i, var in enumerate(variaveis):
    # Share of every category of `var` within each Attrition value.
    shares = df.groupby(['Attrition'])[var].value_counts(normalize=True)
    df_aux = shares.rename('percentage').mul(100).reset_index()

    ax = sns.barplot(data=df_aux, x=var, y="percentage", hue="Attrition",
                     palette='Set2', ax=axes[i])
    ax.set_ylim(0, 60)
    ax.set_ylabel('Percentage of employees (%)')
    ax.set_xlabel('')
    _ = plt.setp(ax.get_xticklabels(), rotation=45)

    # Value label on top of every bar.
    for bar in axes[i].patches:
        bar_top = bar.get_height()
        axes[i].text(bar.get_x() + bar.get_width() / 2.0, bar_top + 0.3,
                     str(bar_top.round(2)), ha='center', va='bottom')
    rects = ax.patches
# Percentage of employees in attrition per EducationField / MaritalStatus
# category, as horizontal bars.
variaveis = ['EducationField','MaritalStatus']
Total = df['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots(nrows=1, ncols=variaveis_len, figsize=(12, 3), dpi=300)
for i, var in enumerate(variaveis):
    subset_df = df[df['Attrition'] == 'Yes']
    leavers = subset_df[var].value_counts()
    subset_values = leavers.keys().tolist()
    total_counts = [(df[var].values == level).sum() for level in subset_values]
    subset_counts = [round(n_yes / n_all * 100, 1)
                     for n_yes, n_all in zip(leavers.tolist(), total_counts)]

    ax = sns.barplot(y=subset_values, x=subset_counts, color='steelblue',
                     data=subset_df, edgecolor='k', ax=axes[i])
    # Right-aligned value at the end of each bar.
    for row, rate in enumerate(subset_counts):
        ax.text(rate, row + 0.1, str(rate), color='k', fontweight='bold',
                fontsize=12, ha='right')
    ax.set_xlabel('Percentage of employees in attrition ')
    ax.set_title(var)
    _ = plt.setp(ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()
# Attrition rate per MaritalStatus category (horizontal bars); the bar with
# the highest rate is highlighted.
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
variaveis = ['MaritalStatus']
Total=df['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots( nrows=1, ncols=1, figsize=(5, 3), dpi=100)
for i,var in enumerate(variaveis):
    subset_df = df[df['Attrition'] == 'Yes']
    leavers = subset_df[var].value_counts()
    subset_values = leavers.keys().tolist()
    total_counts = [(df[var].values == level).sum() for level in subset_values]
    subset_counts = [round(n_yes / n_all * 100, 1)
                     for n_yes, n_all in zip(leavers.tolist(), total_counts)]

    ax = sns.barplot(y=subset_values, x=subset_counts, color='steelblue',
                     data=subset_df, edgecolor='k')

    # The bars are horizontal, so the plotted value is the patch *width*;
    # get_height() is the (constant) bar thickness and would always select
    # the first bar.
    patch_h = [patch.get_width() for patch in ax.patches]
    idx_tallest = np.argmax(patch_h)
    ax.patches[idx_tallest].set_facecolor('lightblue')

    for row, rate in enumerate(subset_counts):
        ax.text(rate, row + 0.1, str(rate) + " %", color='k', fontweight='bold',
                fontsize=12, ha='right')
    ax.set_xlabel('Percentage of employees in attrition ')
    ax.set_title(var)
    _ = plt.setp(ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()
# Attrition rate per BusinessTravel category, sorted from highest to lowest,
# with the highest bar highlighted.
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
variaveis = ['BusinessTravel']
Total=df['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots( nrows=1, ncols=1, figsize=(8,4), dpi=80)
for i,var in enumerate(variaveis):
    subset_df = df[df['Attrition'] == 'Yes']
    leavers = subset_df[var].value_counts()
    subset_values = leavers.keys().tolist()
    total_counts = [(df[var].values == level).sum() for level in subset_values]
    subset_counts = [round(n_yes / n_all * 100, 1)
                     for n_yes, n_all in zip(leavers.tolist(), total_counts)]

    df_new = pd.DataFrame({"subset_counts":subset_counts,
                           "subset_values":subset_values})
    ranked = df_new.sort_values('subset_counts', ascending=False)
    ax = sns.barplot(y=subset_values, x=subset_counts, color='steelblue',
                     data=df_new, edgecolor='k', order=ranked.subset_values)

    # Horizontal bars: the value is the patch width, not its height.
    patch_h = [patch.get_width() for patch in ax.patches]
    idx_tallest = np.argmax(patch_h)
    ax.patches[idx_tallest].set_facecolor('lightblue')

    values = ranked.subset_values
    # Percentage label to the right of each bar.
    for p in ax.patches:
        txt = str(p.get_width().round(2))
        plt.text(p.get_width() + 2, p.get_y() + 0.5, txt + " %",
                 ha='center', va='bottom', fontsize='13')
    rects = ax.patches
    ax.set_xlabel('Percentage of employees in attrition ')
    ax.set_xlim(0,35)
    ax.set_title(var)
    _ = plt.setp(ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()
# Histograms of the four tenure variables, one figure per Attrition value,
# with mean / skewness / quartiles (from stats) written under each panel.
plt.rcParams['font.size'] = 7
variaveis = ['YearsWithCurrManager', 'YearsInCurrentRole', 'YearsAtCompany','YearsSinceLastPromotion']

# (Attrition value, bar colour) for each figure.
figure_specs = (
    ('No', 'darkgreen'),
    ('Yes', 'darkred'),
)
for attrition, colour in figure_specs:
    fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(12, 6), dpi=300)
    for i, var in enumerate(variaveis):
        sample = df[df['Attrition'] == attrition][var]
        sns.histplot(sample, ax=axes[i], bins=20, color=colour).set_title(
            str(var) + ' (Attrition= ' + attrition + ')')
        res = stats(sample)
        captions = (
            (-0.15, 'Mean = ' + str(res[0])),
            (-0.2, 'Skewness = ' + str(res[3])),
            (-0.25, 'Q1 = ' + str(res[1])),
            (-0.3, 'Q3 = ' + str(res[2])),
        )
        # Anchor the captions to the panel they describe rather than to
        # axes[0] shifted by i*1.25, which depends on the subplot spacing.
        for y_pos, caption in captions:
            axes[i].text(0.1, y_pos, caption, transform=axes[i].transAxes,
                         fontsize=10, color='k')
        axes[i].grid(color='grey', linestyle=':', linewidth=0.5)
        axes[i].spines['top'].set_visible(False)
        axes[i].spines['right'].set_visible(False)
    # Keep the y-label on the first panel only (range(1, 3) used to leave the
    # label of the fourth panel in place).
    for other_ax in axes[1:]:
        other_ax.set_ylabel('')
    plt.tight_layout()
    plt.show()
# Box plot + histogram of YearsAtCompany per Attrition group.
plt.rcParams.update({'font.size': 8, 'figure.dpi': 100})
plots1('YearsAtCompany')
# Distribution (in %) of the job-related variables inside each Attrition
# group, one grouped bar chart per variable.
variaveis = ['Department','JobRole','OverTime','BusinessTravel','JobLevel']
Total = df['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots(nrows=variaveis_len, ncols=1, figsize=(8, 20), dpi=100)
for i, var in enumerate(variaveis):
    # Share of every category of `var` within each Attrition value.
    shares = df.groupby(['Attrition'])[var].value_counts(normalize=True)
    df_aux = shares.rename('percentage').mul(100).reset_index()

    ax = sns.barplot(data=df_aux, x=var, y="percentage", hue="Attrition",
                     palette='Set1', ax=axes[i])
    # ax.set_ylim(0,100)
    ax.set_ylabel('Percentage of employees (%)')
    ax.set_xlabel('')
    ax.set_title(var)
    ax.grid(axis='y', color='grey', linestyle='-.', linewidth=0.5)
    _ = plt.setp(ax.get_xticklabels(), rotation=20)

    # Value label on top of every bar.
    for bar in axes[i].patches:
        bar_top = bar.get_height()
        axes[i].text(bar.get_x() + bar.get_width() / 2.0, bar_top + 0.1,
                     str(bar_top.round(2)), ha='center', va='bottom')
    rects = ax.patches
plt.tight_layout()
plt.show()
# Percentage of employees in attrition per category of each job-related
# variable, as horizontal bars.
plt.rcParams['font.size'] = 8
variaveis = ['Department','JobRole','OverTime','BusinessTravel']
Total = df['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots(nrows=variaveis_len, ncols=1, figsize=(6, 8), dpi=200)
for i, var in enumerate(variaveis):
    subset_df = df[df['Attrition'] == 'Yes']
    leavers = subset_df[var].value_counts()
    subset_values = leavers.keys().tolist()
    total_counts = [(df[var].values == level).sum() for level in subset_values]
    subset_counts = [round(n_yes / n_all * 100, 1)
                     for n_yes, n_all in zip(leavers.tolist(), total_counts)]

    ax = sns.barplot(y=subset_values, x=subset_counts, color='steelblue',
                     data=subset_df, edgecolor='k', ax=axes[i])
    # White value label inside the end of each bar.
    for row, rate in enumerate(subset_counts):
        ax.text(rate, row + 0.1, str(rate), color='w', fontweight='bold',
                fontsize=7, ha='right')
    ax.set_xlabel('Percentage of employees in attrition ')
    ax.set_title(var)
    _ = plt.setp(ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()
# Attrition rate per JobRole, sorted from highest to lowest, with the
# highest bar highlighted.
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
variaveis = ['JobRole']
Total=df['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots( nrows=1, ncols=1, figsize=(8,6), dpi=80)
for i,var in enumerate(variaveis):
    subset_df = df[df['Attrition'] == 'Yes']
    leavers = subset_df[var].value_counts()
    subset_values = leavers.keys().tolist()
    total_counts = [(df[var].values == level).sum() for level in subset_values]
    subset_counts = [round(n_yes / n_all * 100, 1)
                     for n_yes, n_all in zip(leavers.tolist(), total_counts)]

    df_new = pd.DataFrame({"subset_counts":subset_counts,
                           "subset_values":subset_values})
    ranked = df_new.sort_values('subset_counts', ascending=False)
    ax = sns.barplot(y=subset_values, x=subset_counts, color='steelblue',
                     data=df_new, edgecolor='k', order=ranked.subset_values)

    # Horizontal bars: the value is the patch width, not its height.
    patch_h = [patch.get_width() for patch in ax.patches]
    idx_tallest = np.argmax(patch_h)
    ax.patches[idx_tallest].set_facecolor('lightblue')

    values = ranked.subset_values
    # Percentage label to the right of each bar.
    for p in ax.patches:
        txt = str(p.get_width().round(2))
        plt.text(p.get_width() + 3.5, p.get_y() + 0.6, txt + " %",
                 ha='center', va='bottom', fontsize='13')
    rects = ax.patches
    ax.set_xlabel('Percentage of employees in attrition ')
    ax.set_xlim(0,50)
    ax.set_title(var)
    _ = plt.setp(ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()
# Attrition rate per OverTime value, sorted from highest to lowest, with the
# highest bar highlighted.
plt.rcParams['font.size'] = 14
plt.rcParams['figure.dpi'] = 100
variaveis = ['OverTime']
Total=df['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots( nrows=1, ncols=1, figsize=(4,3), dpi=80)
for i,var in enumerate(variaveis):
    subset_df = df[df['Attrition'] == 'Yes']
    leavers = subset_df[var].value_counts()
    subset_values = leavers.keys().tolist()
    total_counts = [(df[var].values == level).sum() for level in subset_values]
    subset_counts = [round(n_yes / n_all * 100, 1)
                     for n_yes, n_all in zip(leavers.tolist(), total_counts)]

    df_new = pd.DataFrame({"subset_counts":subset_counts,
                           "subset_values":subset_values})
    ranked = df_new.sort_values('subset_counts', ascending=False)
    ax = sns.barplot(y=subset_values, x=subset_counts, color='steelblue',
                     data=df_new, edgecolor='k', order=ranked.subset_values)

    # Horizontal bars: the value is the patch width, not its height.
    patch_h = [patch.get_width() for patch in ax.patches]
    idx_tallest = np.argmax(patch_h)
    ax.patches[idx_tallest].set_facecolor('lightblue')

    values = ranked.subset_values
    # Percentage label to the right of each bar.
    for p in ax.patches:
        txt = str(p.get_width().round(2))
        plt.text(p.get_width() + 7, p.get_y() + 0.55, txt + " %",
                 ha='center', va='bottom', fontsize='13')
    rects = ax.patches
    ax.set_xlabel('Percentage of employees in attrition ')
    ax.set_xlim(0,50)
    ax.set_title(var)
    _ = plt.setp(ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()
# Attrition rate per JobLevel (vertical bars) with the tallest bar highlighted.
variaveis = ['JobLevel']
Total=df['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots( nrows=1, ncols=1, figsize=(6, 5), dpi=100)
for i,var in enumerate(variaveis):
    subset_df = df[df['Attrition'] == 'Yes']
    leavers = subset_df[var].value_counts()
    subset_values = leavers.keys().tolist()
    total_counts = [(df[var].values == level).sum() for level in subset_values]
    subset_counts = [round(n_yes / n_all * 100, 1)
                     for n_yes, n_all in zip(leavers.tolist(), total_counts)]

    ax=sns.barplot(x=subset_values, y=subset_counts, color='steelblue', data=subset_df,edgecolor='k' )

    # Build the list of heights from this figure's bars only. It used to be
    # appended to a list left over from an earlier cell, which shifted the
    # argmax index and highlighted the wrong bar (or raised IndexError).
    patch_h = [patch.get_height() for patch in ax.patches]
    idx_tallest = np.argmax(patch_h)
    ax.patches[idx_tallest].set_facecolor('lightblue')

    # Percentage label above each bar.
    for p in ax.patches:
        txt = str(p.get_height().round(2))
        plt.text(p.get_x() + p.get_width() / 2.0, p.get_height() + 0.02, txt + ' %',
                 ha='center', va='bottom', fontsize='13')
    rects = ax.patches
    plt.ylabel('Percentage of employees in attrition ')
    plt.title(var)
plt.tight_layout()
plt.show()
# YearsAtCompany by JobLevel, split by Attrition.
x_col, y_col = "JobLevel", "YearsAtCompany"
plt.figure(figsize=(7, 5), dpi=100)
box_plot = sns.boxplot(data=df, x=x_col, y=y_col, hue='Attrition', palette="crest")
# Median per level and a vertical offset; only needed if median labels are
# drawn on the boxes (currently disabled).
medians = df.groupby([x_col])[y_col].median()
vertical_offset = df[y_col].median() * 0.1
plt.xlabel(x_col)
plt.ylabel(y_col)
plt.show()
# Row-normalised (%) cross-tabulation of JobRole vs BusinessTravel, shown as
# one heatmap per Attrition value.
var1 = 'JobRole'
var2 = 'BusinessTravel'
plt.rcParams['font.size'] = 10
heatmap_style = dict(linewidths=.5, annot=True, cmap="YlGnBu", fmt='.1f',
                     annot_kws={"size": 8})

df_aux_YES = df.loc[df['Attrition'] == 'Yes']
df_aux_No = df.loc[df['Attrition'] == 'No']
df_var1_vs_var2 = pd.crosstab(df_aux_YES[var1], df_aux_YES[var2], normalize='index') * 100
df_var1_vs_var3 = pd.crosstab(df_aux_No[var1], df_aux_No[var2], normalize='index') * 100

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(7, 5), dpi=100)

# Left panel: employees who left.
sns.heatmap(df_var1_vs_var2, ax=axes[0], **heatmap_style)
axes[0].set_title('Attrition = YES')
axes[0].set_xlabel(var2, fontsize=10)
axes[0].set_ylabel(var1, fontsize=10)

# Right panel: employees who stayed (y-label omitted, same rows as the left).
sns.heatmap(df_var1_vs_var3, ax=axes[1], **heatmap_style)
axes[1].set_title('Attrition = No')
axes[1].set_xlabel(var2, fontsize=10)
axes[1].set_ylabel('')

plt.tight_layout()
plt.show()
# Row-normalised (%) cross-tabulation of JobRole vs JobLevel, shown as one
# heatmap per Attrition value.
var1 = 'JobRole'
var2 = 'JobLevel'
heatmap_style = dict(linewidths=.5, annot=True, cmap="YlGnBu", fmt='.1f',
                     annot_kws={"size": 8})

df_aux_YES = df.loc[df['Attrition'] == 'Yes']
df_aux_No = df.loc[df['Attrition'] == 'No']
df_var1_vs_var2 = pd.crosstab(df_aux_YES[var1], df_aux_YES[var2], normalize='index') * 100
df_var1_vs_var3 = pd.crosstab(df_aux_No[var1], df_aux_No[var2], normalize='index') * 100

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 4), dpi=100)

# Left panel: employees who left.
sns.heatmap(df_var1_vs_var2, ax=axes[0], **heatmap_style)
axes[0].set_title('Attrition = YES')
axes[0].set_xlabel(var2, fontsize=10)
axes[0].set_ylabel(var1, fontsize=10)

# Right panel: employees who stayed.
sns.heatmap(df_var1_vs_var3, ax=axes[1], **heatmap_style)
axes[1].set_title('Attrition = No')
axes[1].set_xlabel(var2, fontsize=10)
axes[1].set_ylabel('')

plt.tight_layout()
plt.show()
# Distribution (in %) of JobInvolvement and WorkLifeBalance inside each
# Attrition group, one grouped bar chart per variable.
variaveis = ['JobInvolvement','WorkLifeBalance']
Total = df['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots(nrows=variaveis_len, ncols=1, figsize=(4, 6), dpi=120)
for i, var in enumerate(variaveis):
    # Share of every level of `var` within each Attrition value.
    shares = df.groupby(['Attrition'])[var].value_counts(normalize=True)
    df_aux = shares.rename('percentage').mul(100).reset_index()

    ax = sns.barplot(data=df_aux, x=var, y="percentage", hue="Attrition",
                     palette='Set2', ax=axes[i])
    # ax.set_ylim(0,100)
    ax.set_ylabel('Percentage of employees (%)')
    ax.set_xlabel('')
    ax.set_title(var)
    ax.grid(axis='y', color='grey', linestyle='-.', linewidth=0.5)
    _ = plt.setp(ax.get_xticklabels(), rotation=0)

    # Value label on top of every bar.
    for bar in axes[i].patches:
        bar_top = bar.get_height()
        axes[i].text(bar.get_x() + bar.get_width() / 2.0, bar_top + 0.1,
                     str(bar_top.round(2)), ha='center', va='bottom')
    rects = ax.patches
    sns.despine(ax=axes[i], offset=5)
plt.tight_layout()
plt.show()
# Percentage of employees in attrition per level of each satisfaction /
# involvement variable, as vertical bars.
variaveis = ['EnvironmentSatisfaction', 'JobSatisfaction', 'RelationshipSatisfaction','JobInvolvement','WorkLifeBalance']
plt.rcParams['font.size'] = 8
Total = df['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots(nrows=variaveis_len, ncols=1, figsize=(5, 13), dpi=120)
for i, var in enumerate(variaveis):
    subset_df = df[df['Attrition'] == 'Yes']
    leavers = subset_df[var].value_counts()
    subset_values = leavers.keys().tolist()
    total_counts = [(df[var].values == level).sum() for level in subset_values]
    subset_counts = [round(n_yes / n_all * 100, 1)
                     for n_yes, n_all in zip(leavers.tolist(), total_counts)]

    ax = sns.barplot(x=subset_values, y=subset_counts, color='green',
                     data=subset_df, edgecolor='k', ax=axes[i])
    # White value label just below the top edge of each bar.
    for bar in axes[i].patches:
        bar_top = bar.get_height()
        axes[i].text(bar.get_x() + bar.get_width() / 2.0, bar_top - 0.3,
                     str(bar_top.round(2)), ha='center', va='top',
                     fontsize=10, color='w')
    rects = ax.patches
    ax.set_ylabel('Percentage of employees in attrition ')
    ax.set_title(var)
plt.tight_layout()
plt.show()
# Row-normalised (%) cross-tabulation of JobLevel vs WorkLifeBalance, shown
# as one heatmap per Attrition value.
var1 = 'JobLevel'
var2 = 'WorkLifeBalance'
plt.rcParams['font.size'] = 10
heatmap_style = dict(linewidths=.5, annot=True, cmap="YlGnBu", fmt='.1f',
                     annot_kws={"size": 8})

df_aux_YES = df.loc[df['Attrition'] == 'Yes']
df_aux_No = df.loc[df['Attrition'] == 'No']
df_var1_vs_var2 = pd.crosstab(df_aux_YES[var1], df_aux_YES[var2], normalize='index') * 100
df_var1_vs_var3 = pd.crosstab(df_aux_No[var1], df_aux_No[var2], normalize='index') * 100

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(6, 3), dpi=120)

# Left panel: employees who left.
sns.heatmap(df_var1_vs_var2, ax=axes[0], **heatmap_style)
axes[0].set_title('Attrition = YES')
axes[0].set_xlabel(var2, fontsize=10)
axes[0].set_ylabel(var1, fontsize=10)

# Right panel: employees who stayed.
sns.heatmap(df_var1_vs_var3, ax=axes[1], **heatmap_style)
axes[1].set_title('Attrition = No')
axes[1].set_xlabel(var2, fontsize=10)
axes[1].set_ylabel('')

plt.tight_layout()
plt.show()
# YearsAtCompany by JobSatisfaction level, split by Attrition.
x_col, y_col = "JobSatisfaction", "YearsAtCompany"
plt.figure(figsize=(6, 5), dpi=100)
box_plot = sns.boxplot(data=df, x=x_col, y=y_col, hue='Attrition', palette="crest")
# Median per level and a vertical offset; only needed if median labels are
# drawn on the boxes (currently disabled).
medians = df.groupby([x_col])[y_col].median()
vertical_offset = df[y_col].median() * 0.1
plt.xlabel(x_col)
plt.ylabel(y_col)
plt.show()
# Row-normalised (%) cross-tabulation of JobRole vs JobSatisfaction, shown as
# one heatmap per Attrition value.
var1 = 'JobRole'
var2 = 'JobSatisfaction'
heatmap_style = dict(linewidths=.5, annot=True, cmap="YlGnBu", fmt='.1f',
                     annot_kws={"size": 8})

df_aux_YES = df.loc[df['Attrition'] == 'Yes']
df_aux_No = df.loc[df['Attrition'] == 'No']
df_var1_vs_var2 = pd.crosstab(df_aux_YES[var1], df_aux_YES[var2], normalize='index') * 100
df_var1_vs_var3 = pd.crosstab(df_aux_No[var1], df_aux_No[var2], normalize='index') * 100

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 4), dpi=150)

# Left panel: employees who left.
sns.heatmap(df_var1_vs_var2, ax=axes[0], **heatmap_style)
axes[0].set_title('Attrition = YES')
axes[0].set_xlabel(var2, fontsize=10)
axes[0].set_ylabel(var1, fontsize=10)

# Right panel: employees who stayed.
sns.heatmap(df_var1_vs_var3, ax=axes[1], **heatmap_style)
axes[1].set_title('Attrition = No')
axes[1].set_xlabel(var2, fontsize=10)
axes[1].set_ylabel('')

plt.tight_layout()
plt.show()
# Histograms of MonthlyIncome and PercentSalaryHike, one figure per Attrition
# value, with mean / skewness / quartiles (from stats) under each panel.
plt.rcParams['font.size'] = 14
plt.rcParams['figure.dpi'] = 100
variaveis = ['MonthlyIncome','PercentSalaryHike']

# (Attrition value, bar colour, figure dpi, grid line width) for each figure;
# dpi=None falls back to rcParams['figure.dpi'].
figure_specs = (
    ('No', 'darkgreen', None, 0.5),
    ('Yes', 'darkred', 300, 0.7),
)
for attrition, colour, dpi, grid_width in figure_specs:
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6), dpi=dpi)
    for i, var in enumerate(variaveis):
        sample = df[df['Attrition'] == attrition][var]
        sns.histplot(sample, ax=axes[i], bins=20, color=colour).set_title(
            str(var) + ' (Attrition= ' + attrition + ')')
        res = stats(sample)
        captions = (
            (-0.15, 'Mean = ' + str(res[0])),
            (-0.2, 'Skewness = ' + str(res[3])),
            (-0.25, 'Q1 = ' + str(res[1])),
            (-0.3, 'Q3 = ' + str(res[2])),
        )
        # Anchor the captions to the panel they describe rather than to
        # axes[0] shifted by i*1.25, which depends on the subplot spacing.
        for y_pos, caption in captions:
            axes[i].text(0.1, y_pos, caption, transform=axes[i].transAxes,
                         fontsize=13, color='k')
        axes[i].grid(color='grey', linestyle='-.', linewidth=grid_width)
        sns.despine(ax=axes[i], offset=5)
        axes[i].set_xlabel('')
    # Keep the y-label on the first panel only.
    for other_ax in axes[1:]:
        other_ax.set_ylabel('')
    plt.tight_layout()
    plt.show()
# MonthlyIncome by JobRole, split by Attrition.
x_col, y_col = "JobRole", "MonthlyIncome"
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 5), dpi=100)
ax = sns.boxplot(data=df, x=x_col, y=y_col, hue='Attrition', palette="GnBu")
# Per-role medians and an offset are computed but not drawn on this figure.
medians = df.groupby([x_col])[y_col].median()
vertical_offset = df[y_col].median() * 0.1
ax.grid(axis='y', color='grey', linestyle='-.', linewidth=0.5)
_ = plt.setp(ax.get_xticklabels(), rotation=20, ha='right')
plt.xlabel(x_col)
plt.ylabel(y_col)
plt.show()
# MonthlyIncome by age group (the 'age_bins' column), split by Attrition.
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
plt.figure(figsize=(10,6))
box_plot = sns.boxplot(x="age_bins", y="MonthlyIncome", data=df,palette="crest",hue='Attrition')
# Median per age group and a vertical offset; only needed if median labels
# are drawn on the boxes (currently disabled).
medians = df.groupby(['age_bins'])['MonthlyIncome'].median()
vertical_offset =df['MonthlyIncome'].median() * 0.1
plt.xlabel("Age group")
# Title and y-label previously read "Montly Income".
plt.title ("Monthly Income per age group")
plt.ylabel("Monthly Income")
plt.show()
# For PerformanceRating and TrainingTimesLastYear: percentage of employees at
# each value of the variable whose Attrition is 'Yes' (leavers at that value
# divided by all employees at that value), one bar chart per variable.
variaveis = ['PerformanceRating', 'TrainingTimesLastYear']
Total = df['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5), dpi=100)
for i, var in enumerate(variaveis):
    subset_df = df[df['Attrition'] == 'Yes']
    leaver_counts = subset_df[var].value_counts()
    subset_values = leaver_counts.keys().tolist()
    # Head-count of ALL employees for each value that occurs among leavers.
    total_counts = [(df[var].values == x).sum() for x in subset_values]
    subset_counts = [
        round(hits / total * 100, 1)
        for hits, total in zip(leaver_counts.tolist(), total_counts)
    ]
    # x/y are explicit lists, so no `data` frame is passed: the original
    # handed over `subset_df`, whose length differs from the lists, a
    # combination that recent seaborn releases reject.
    ax = sns.barplot(x=subset_values, y=subset_counts, color='green',
                     edgecolor='k', ax=axes[i])
    # Write each percentage just below the top of its bar.
    for p in axes[i].patches:
        txt = str(p.get_height().round(2))
        txt_x = p.get_x()
        txt_y = p.get_height()
        axes[i].text(txt_x + p.get_width() / 2.0, txt_y - 0.3, txt,
                     ha='center', va='top', fontsize=14, color='w')
    ax.set_ylabel('Percentage of employees in attrition ')
    ax.set_title(var)
    sns.despine(ax=ax, offset=5)
plt.tight_layout()
plt.show()
# Percentage of employees with Attrition == 'Yes' for each value of
# TrainingTimesLastYear (leavers at that value / all employees at that value).
# The tallest bar is recoloured in light blue.
plt.rcParams.update({'font.size': 14, 'figure.dpi': 100})
variaveis = ['TrainingTimesLastYear']
Total = df['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(8, 6), dpi=80)
for i, var in enumerate(variaveis):
    subset_df = df[df['Attrition'] == 'Yes']
    leaver_counts = subset_df[var].value_counts()
    subset_values = leaver_counts.keys().tolist()
    total_counts = [(df[var].values == x).sum() for x in subset_values]
    subset_counts = [
        round(hits / total * 100, 1)
        for hits, total in zip(leaver_counts.tolist(), total_counts)
    ]
    df_new = pd.DataFrame({"subset_counts": subset_counts,
                           "subset_values": subset_values})
    ax = sns.barplot(x=subset_values, y=subset_counts, color='steelblue',
                     data=df_new, edgecolor='k')
    # Index of the bar with the largest height.
    patch_h = [patch.get_height() for patch in ax.patches]
    idx_tallest = np.argmax(patch_h)
    ax.patches[idx_tallest].set_facecolor('lightblue')
    values = df_new.sort_values('subset_counts', ascending=False).subset_values
    # Percentage label just below the top of each bar.
    for p in ax.patches:
        bar_height = p.get_height()
        ax.text(p.get_x() + p.get_width() / 2.0, bar_height - 0.3,
                str(bar_height.round(2)) + ' %',
                ha='center', va='top', fontsize=16, color='k')
    ax.set_ylabel('Percentage of employees in attrition ')
    ax.set_title(var)
    _ = plt.setp(ax.get_xticklabels(), rotation=0)
plt.tight_layout()
plt.show()
# Boxplot of PercentSalaryHike per PerformanceRating, split by Attrition.
plt.figure(figsize=(10, 6))
box_plot = sns.boxplot(data=df, x="PerformanceRating", y="PercentSalaryHike",
                       hue='Attrition', palette="Set2")
# `box_plot` is the current axes, so labelling it directly matches the
# pyplot-level calls.
box_plot.set_ylabel("PercentSalaryHike")
box_plot.set_xlabel("PerformanceRating")
box_plot.grid(axis='y', color='grey', linestyle='-.', linewidth=0.5)
plt.show()
# Horizontal boxplot of YearsSinceLastPromotion per age group, split by Attrition.
plt.figure(figsize=(10, 6))
# These settings are changed after the figure above has already been created.
plt.rcParams.update({'font.size': 13, 'figure.dpi': 100})
box_plot = sns.boxplot(data=df, y="age_bins", x="YearsSinceLastPromotion",
                       hue='Attrition', palette="Set3")
box_plot.set_ylabel("Age group")
box_plot.set_title("YearsSinceLastPromotion per age group")
box_plot.set_xlabel("YearsSinceLastPromotion")
box_plot.grid(axis='y', color='grey', linestyle='-.', linewidth=0.5)
plt.show()
# For StockOptionLevel and age_bins: within each Attrition group, the
# percentage of employees at every value of the variable, drawn as grouped
# bars (one panel per variable) with the value written above each bar.
variaveis = ['StockOptionLevel', 'age_bins']
Total = df['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots(nrows=variaveis_len, ncols=1, figsize=(6, 10), dpi=100)
for i, var in enumerate(variaveis):
    # Normalised counts per Attrition group, scaled to percent.
    shares = df.groupby(['Attrition'])[var].value_counts(normalize=True)
    df_aux = shares.rename('percentage').mul(100).reset_index()
    ax = sns.barplot(data=df_aux, x=var, y="percentage", hue="Attrition",
                     palette='Set1', ax=axes[i])
    ax.set_ylabel('Percentage of employees (%)')
    ax.set_xlabel('')
    ax.set_title(var)
    ax.grid(axis='y', color='grey', linestyle='-.', linewidth=0.5)
    for p in axes[i].patches:
        bar_height = p.get_height()
        axes[i].text(p.get_x() + p.get_width() / 2.0, bar_height + 0.1,
                     str(bar_height.round(2)), ha='center', va='bottom')
    _ = plt.setp(ax.get_xticklabels(), rotation=20, ha='right')
    rects = ax.patches
plt.tight_layout()
plt.show()
# Percentage of employees with Attrition == 'Yes' for each StockOptionLevel
# (leavers at that level / all employees at that level).
# The tallest bar is recoloured in light blue.
plt.rcParams.update({'font.size': 14, 'figure.dpi': 100})
variaveis = ['StockOptionLevel']
Total = df['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(8, 6), dpi=80)
for i, var in enumerate(variaveis):
    subset_df = df[df['Attrition'] == 'Yes']
    leaver_counts = subset_df[var].value_counts()
    subset_values = leaver_counts.keys().tolist()
    total_counts = [(df[var].values == x).sum() for x in subset_values]
    subset_counts = [
        round(hits / total * 100, 1)
        for hits, total in zip(leaver_counts.tolist(), total_counts)
    ]
    df_new = pd.DataFrame({"subset_counts": subset_counts,
                           "subset_values": subset_values})
    ax = sns.barplot(x=subset_values, y=subset_counts, color='steelblue',
                     data=df_new, edgecolor='k')
    # Index of the bar with the largest height.
    patch_h = [patch.get_height() for patch in ax.patches]
    idx_tallest = np.argmax(patch_h)
    ax.patches[idx_tallest].set_facecolor('lightblue')
    values = df_new.sort_values('subset_counts', ascending=False).subset_values
    # Percentage label just below the top of each bar.
    for p in ax.patches:
        bar_height = p.get_height()
        ax.text(p.get_x() + p.get_width() / 2.0, bar_height - 0.3,
                str(bar_height.round(2)) + ' %',
                ha='center', va='top', fontsize=16, color='k')
    ax.set_ylabel('Percentage of employees in attrition ')
    ax.set_title(var)
    _ = plt.setp(ax.get_xticklabels(), rotation=0)
plt.tight_layout()
plt.show()
# Percentage of employees with Attrition == 'Yes' for each value of
# YearsAtCompany (leavers at that value / all employees at that value).
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
variaveis = ['YearsAtCompany']
Total = df['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(8, 6), dpi=100)
for i, var in enumerate(variaveis):
    subset_df = df[df['Attrition'] == 'Yes']
    leaver_counts = subset_df[var].value_counts()
    subset_values = leaver_counts.keys().tolist()
    # Head-count of ALL employees for each value that occurs among leavers.
    total_counts = [(df[var].values == x).sum() for x in subset_values]
    subset_counts = [
        round(hits / total * 100, 1)
        for hits, total in zip(leaver_counts.tolist(), total_counts)
    ]
    # x/y are explicit lists, so no `data` frame is passed: the original
    # handed over `subset_df`, whose length differs from the lists, a
    # combination that recent seaborn releases reject.
    ax = sns.barplot(x=subset_values, y=subset_counts, color='red',
                     edgecolor='k', alpha=0.8)
    for p in ax.patches:
        txt = str(p.get_height().round(2))
        txt_x = p.get_x()
        txt_y = p.get_height()
        ax.text(txt_x + p.get_width() / 2.0, txt_y - 0.3, txt,
                ha='center', va='bottom', fontsize=9, color='k')
    ax.set_ylabel('Percentage of employees in attrition ')
    ax.set_title(var)
    sns.despine(ax=ax, offset=5)
plt.tight_layout()
plt.show()
# Scatter of TotalWorkingYears against YearsAtCompany, coloured by Attrition.
plt.figure(figsize=(8, 4))
scatter_ax = sns.scatterplot(data=df, x='YearsAtCompany', y='TotalWorkingYears',
                             hue='Attrition', palette='flare')
# `scatter_ax` is the current axes, so these match the pyplot-level calls.
scatter_ax.set_xlabel("YearsAtCompany")
scatter_ax.set_ylabel("TotalWorkingYears")
scatter_ax.legend(shadow=True, fancybox=True, ncol=5, title="Attrition")
plt.show()
# Scatter of YearsSinceLastPromotion against YearsAtCompany, coloured by Attrition.
plt.figure(figsize=(8, 4))
scatter_ax = sns.scatterplot(data=df, x='YearsAtCompany',
                             y='YearsSinceLastPromotion',
                             hue='Attrition', palette='flare')
# `scatter_ax` is the current axes, so these match the pyplot-level calls.
scatter_ax.set_xlabel("YearsAtCompany")
scatter_ax.set_ylabel("YearsSinceLastPromotion")
scatter_ax.legend(shadow=True, fancybox=True, ncol=5, title="Attrition")
plt.show()
# Build a numeric version of the data set: add 0/1 indicator columns,
# one-hot encode what remains with get_dummies, then min-max scale every column.
df_transf = df.copy()
df_transf.describe(include=[object]).T

# Indicator columns (1 = overtime / attrition / single / travels).
overtime_no_as_zero = df_transf['OverTime'].replace('No', 0)
df_transf['OverTime_2'] = overtime_no_as_zero.replace('Yes', 1)
df_transf['Attrition_2'] = np.where(df['Attrition'] == 'Yes', 1, 0)
df_transf['Single'] = np.where(df['MaritalStatus'] == 'Single', 1, 0)
df_transf['Travel'] = np.where(df['BusinessTravel'] != 'Non-Travel', 1, 0)
# df_transf2=df_transf.drop(['OverTime', 'Attrition','MaritalStatus','BusinessTravel','Department',
# 'EducationField','Gender','Over18','Attrition_aux','age_bins','HourlyRate_bins'], axis=1)

df_dummies = pd.get_dummies(df_transf)
df_dummies

# Fit the scaler on the encoded frame and rebuild a DataFrame with the same
# column names (the result gets a fresh default index).
scaler = MinMaxScaler()
scaler.fit(df_dummies)
cols = df_dummies.columns
scaled_df = pd.DataFrame(scaler.transform(df_dummies), columns=cols)
scaled_df.head().T
# Among employees with Attrition == 'Yes' (`df_seg_att`): share of each
# JobRole, in percent of all leavers, as horizontal bars sorted from the
# largest to the smallest share. The longest bar is recoloured light green.
df_seg_att = df.loc[df['Attrition'] == 'Yes']
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
variaveis = ['JobRole']
Total = df_seg_att['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(8, 6), dpi=80)
for i, var in enumerate(variaveis):
    role_counts = df_seg_att[var].value_counts()
    subset_counts = role_counts.tolist()
    print(subset_counts)
    subset_values = role_counts.keys().tolist()
    subset_counts = [round(count / Total * 100, 1) for count in subset_counts]
    df_new = pd.DataFrame({"subset_counts": subset_counts,
                           "subset_values": subset_values})
    ax = sns.barplot(y=subset_values, x=subset_counts, color='green',
                     data=df_new, edgecolor='k',
                     order=df_new.sort_values('subset_counts',
                                              ascending=False).subset_values)
    # Bars are horizontal, so the plotted value is the bar WIDTH; the
    # height is only the bar thickness and cannot identify the largest bar.
    bar_lengths = [patch.get_width() for patch in ax.patches]
    idx_longest = np.argmax(bar_lengths)
    ax.patches[idx_longest].set_facecolor('lightgreen')
    # Percentage label to the right of each bar's end.
    for p in ax.patches:
        txt = str(p.get_width().round(2))
        txt_x = p.get_width()
        txt_y = p.get_y()
        ax.text(txt_x + 3.5, txt_y + 0.6, txt + " %",
                ha='center', va='bottom', fontsize='13')
    ax.set_xlabel('Percentage of employees in attrition ')
    ax.set_xlim(0, 50)
    ax.set_title(var)
    _ = plt.setp(ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()
# Among employees in `df_seg_att`: share of each StockOptionLevel, in percent
# of the `Total` row count. The tallest bar is recoloured light green.
plt.rcParams['font.size'] = 14
plt.rcParams['figure.dpi'] = 100
variaveis = ['StockOptionLevel']
Total = df_seg_att['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(6, 4), dpi=80)
for i, var in enumerate(variaveis):
    level_counts = df_seg_att[var].value_counts()
    subset_values = level_counts.keys().tolist()
    subset_counts = [round(count / Total * 100, 1)
                     for count in level_counts.tolist()]
    df_new = pd.DataFrame({"subset_counts": subset_counts,
                           "subset_values": subset_values})
    # x/y are explicit lists, so no `data` frame is passed: the original
    # handed over `df_seg_att`, whose length differs from the lists, a
    # combination that recent seaborn releases reject.
    ax = sns.barplot(x=subset_values, y=subset_counts, color='green',
                     edgecolor='k')
    patch_h = [patch.get_height() for patch in ax.patches]
    idx_tallest = np.argmax(patch_h)
    ax.patches[idx_tallest].set_facecolor('lightgreen')
    # Percentage label just below the top of each bar.
    for p in ax.patches:
        txt = str(p.get_height().round(2))
        txt_x = p.get_x()
        txt_y = p.get_height()
        ax.text(txt_x + p.get_width() / 2.0, txt_y - 0.3, txt + ' %',
                ha='center', va='top', fontsize=16, color='k')
    ax.set_ylabel('Percentage of employees in attrition ')
    ax.set_title(var)
    _ = plt.setp(ax.get_xticklabels(), rotation=0)
plt.tight_layout()
plt.show()
# Among employees in `df_seg_att`: share of each JobLevel, in percent of the
# `Total` row count. The tallest bar is recoloured light green.
plt.rcParams['font.size'] = 14
plt.rcParams['figure.dpi'] = 100
variaveis = ['JobLevel']
Total = df_seg_att['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(8, 6), dpi=80)
for i, var in enumerate(variaveis):
    level_counts = df_seg_att[var].value_counts()
    subset_values = level_counts.keys().tolist()
    subset_counts = [round(count / Total * 100, 1)
                     for count in level_counts.tolist()]
    df_new = pd.DataFrame({"subset_counts": subset_counts,
                           "subset_values": subset_values})
    # x/y are explicit lists, so no `data` frame is passed: the original
    # handed over `df_seg_att`, whose length differs from the lists, a
    # combination that recent seaborn releases reject.
    ax = sns.barplot(x=subset_values, y=subset_counts, color='green',
                     edgecolor='k')
    patch_h = [patch.get_height() for patch in ax.patches]
    idx_tallest = np.argmax(patch_h)
    ax.patches[idx_tallest].set_facecolor('lightgreen')
    # Percentage label anchored slightly below the top of each bar.
    for p in ax.patches:
        txt = str(p.get_height().round(2))
        txt_x = p.get_x()
        txt_y = p.get_height()
        ax.text(txt_x + p.get_width() / 2.0, txt_y - 0.3, txt + ' %',
                ha='center', va='bottom', fontsize=16, color='k')
    ax.set_ylabel('Percentage of employees in attrition ')
    ax.set_title(var)
    _ = plt.setp(ax.get_xticklabels(), rotation=0)
plt.tight_layout()
plt.show()
# Among employees in `df_seg_att`: share of each JobInvolvement value, in
# percent of the `Total` row count. The tallest bar is recoloured light green.
plt.rcParams['font.size'] = 14
plt.rcParams['figure.dpi'] = 100
variaveis = ['JobInvolvement']
Total = df_seg_att['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(8, 6), dpi=80)
for i, var in enumerate(variaveis):
    level_counts = df_seg_att[var].value_counts()
    subset_values = level_counts.keys().tolist()
    subset_counts = [round(count / Total * 100, 1)
                     for count in level_counts.tolist()]
    df_new = pd.DataFrame({"subset_counts": subset_counts,
                           "subset_values": subset_values})
    # x/y are explicit lists, so no `data` frame is passed: the original
    # handed over `df_seg_att`, whose length differs from the lists, a
    # combination that recent seaborn releases reject.
    ax = sns.barplot(x=subset_values, y=subset_counts, color='green',
                     edgecolor='k')
    patch_h = [patch.get_height() for patch in ax.patches]
    idx_tallest = np.argmax(patch_h)
    ax.patches[idx_tallest].set_facecolor('lightgreen')
    # Percentage label anchored slightly below the top of each bar.
    for p in ax.patches:
        txt = str(p.get_height().round(2))
        txt_x = p.get_x()
        txt_y = p.get_height()
        ax.text(txt_x + p.get_width() / 2.0, txt_y - 0.3, txt + ' %',
                ha='center', va='bottom', fontsize=16, color='k')
    ax.set_ylabel('Percentage of employees in attrition ')
    ax.set_title(var)
    _ = plt.setp(ax.get_xticklabels(), rotation=0)
plt.tight_layout()
plt.show()
# Among employees in `df_seg_att`: share of each BusinessTravel category, in
# percent of the `Total` row count, as horizontal bars sorted from the largest
# to the smallest share. The longest bar is recoloured light green.
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100
variaveis = ['BusinessTravel']
Total = df_seg_att['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(6, 3))
for i, var in enumerate(variaveis):
    category_counts = df_seg_att[var].value_counts()
    subset_counts = category_counts.tolist()
    print(subset_counts)
    subset_values = category_counts.keys().tolist()
    subset_counts = [round(count / Total * 100, 1) for count in subset_counts]
    df_new = pd.DataFrame({"subset_counts": subset_counts,
                           "subset_values": subset_values})
    ax = sns.barplot(y=subset_values, x=subset_counts, color='green',
                     data=df_new, edgecolor='k',
                     order=df_new.sort_values('subset_counts',
                                              ascending=False).subset_values)
    # Bars are horizontal, so the plotted value is the bar WIDTH; the
    # height is only the bar thickness and cannot identify the largest bar.
    bar_lengths = [patch.get_width() for patch in ax.patches]
    idx_longest = np.argmax(bar_lengths)
    ax.patches[idx_longest].set_facecolor('lightgreen')
    # Percentage label to the right of each bar's end.
    for p in ax.patches:
        txt = str(p.get_width().round(2))
        txt_x = p.get_width()
        txt_y = p.get_y()
        ax.text(txt_x + 6, txt_y + 0.6, txt + " %",
                ha='center', va='bottom', fontsize='13')
    ax.set_xlabel('Percentage of employees in attrition ')
    ax.set_xlim(0, 80)
    ax.set_title(var)
    _ = plt.setp(ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()
# Among employees in `df_seg_att`: share of each OverTime value, in percent of
# the `Total` row count, as horizontal bars sorted from the largest to the
# smallest share. The longest bar is recoloured light green.
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100
variaveis = ['OverTime']
Total = df_seg_att['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(6, 3))
for i, var in enumerate(variaveis):
    category_counts = df_seg_att[var].value_counts()
    subset_counts = category_counts.tolist()
    print(subset_counts)
    subset_values = category_counts.keys().tolist()
    subset_counts = [round(count / Total * 100, 1) for count in subset_counts]
    df_new = pd.DataFrame({"subset_counts": subset_counts,
                           "subset_values": subset_values})
    ax = sns.barplot(y=subset_values, x=subset_counts, color='green',
                     data=df_new, edgecolor='k',
                     order=df_new.sort_values('subset_counts',
                                              ascending=False).subset_values)
    # Bars are horizontal, so the plotted value is the bar WIDTH; the
    # height is only the bar thickness and cannot identify the largest bar.
    bar_lengths = [patch.get_width() for patch in ax.patches]
    idx_longest = np.argmax(bar_lengths)
    ax.patches[idx_longest].set_facecolor('lightgreen')
    # Percentage label to the right of each bar's end.
    for p in ax.patches:
        txt = str(p.get_width().round(2))
        txt_x = p.get_width()
        txt_y = p.get_y()
        ax.text(txt_x + 6, txt_y + 0.6, txt + " %",
                ha='center', va='bottom', fontsize='13')
    ax.set_xlabel('Percentage of employees in attrition ')
    ax.set_xlim(0, 80)
    ax.set_title(var)
    _ = plt.setp(ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()
# Among employees in `df_seg_att`: share of each TrainingTimesLastYear value,
# in percent of the `Total` row count. The tallest bar is recoloured light green.
plt.rcParams['font.size'] = 9
plt.rcParams['figure.dpi'] = 100
variaveis = ['TrainingTimesLastYear']
Total = df_seg_att['Department'].value_counts().sum()
variaveis_len = len(variaveis)
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(6, 4), dpi=100)
for i, var in enumerate(variaveis):
    value_counts = df_seg_att[var].value_counts()
    subset_values = value_counts.keys().tolist()
    subset_counts = [round(count / Total * 100, 1)
                     for count in value_counts.tolist()]
    df_new = pd.DataFrame({"subset_counts": subset_counts,
                           "subset_values": subset_values})
    # x/y are explicit lists, so no `data` frame is passed: the original
    # handed over `df_seg_att`, whose length differs from the lists, a
    # combination that recent seaborn releases reject.
    ax = sns.barplot(x=subset_values, y=subset_counts, color='green',
                     edgecolor='k')
    patch_h = [patch.get_height() for patch in ax.patches]
    idx_tallest = np.argmax(patch_h)
    ax.patches[idx_tallest].set_facecolor('lightgreen')
    # Percentage label anchored slightly below the top of each bar.
    for p in ax.patches:
        txt = str(p.get_height().round(2))
        txt_x = p.get_x()
        txt_y = p.get_height()
        ax.text(txt_x + p.get_width() / 2.0, txt_y - 0.3, txt + ' %',
                ha='center', va='bottom', fontsize=12, color='k')
    ax.set_ylabel('Percentage of employees in attrition ')
    ax.set_title(var)
    _ = plt.setp(ax.get_xticklabels(), rotation=0)
plt.tight_layout()
plt.show()
# Count of Sales Representatives per Gender, one facet per Attrition value.
# Each bar gets its raw count above it and, inside it, its share of all
# Sales Representatives (`soma` is the row count of the whole subset).
var = 'Gender'
dataframe = df.loc[df['JobRole'] == 'Sales Representative']
plt.rcParams.update({'font.size': 10, 'figure.dpi': 100})
# NOTE(review): `legend` receives a list here; confirm this is intended.
g = sns.catplot(data=dataframe, x=var, col="Attrition", col_wrap=4,
                kind="count", height=2.5, aspect=.8, palette='inferno',
                legend=['yes', 'no'])
# Pass 1: raw count slightly above each bar, for the first two facets.
for facet_col in (0, 1):
    ax = g.facet_axis(0, facet_col)
    for p in ax.patches:
        bar_top = p.get_height()
        ax.text(p.get_x() + p.get_width() / 2, bar_top * 1.02, bar_top,
                color='black', rotation='horizontal', size='medium',
                ha='center')
subset_df = dataframe[dataframe[var] == 1]
soma = dataframe.shape[0]
# Pass 2: percentage of the whole subset, written inside each bar.
for facet_col in (0, 1):
    ax = g.facet_axis(0, facet_col)
    for p in ax.patches:
        share = "%.1f" % ((p.get_height() / soma) * 100) + " %"
        ax.text(p.get_x() + p.get_width() / 2, p.get_height() / 2.5, share,
                color='w', rotation='horizontal', size='medium', ha='center')
plt.show()
# Count of Sales Representatives per OverTime value, one facet per Attrition
# value. Each bar gets its raw count above it and, inside it, its share of all
# Sales Representatives (`soma` is the row count of the whole subset).
var = 'OverTime'
dataframe = df.loc[df['JobRole'] == 'Sales Representative']
plt.rcParams.update({'font.size': 10, 'figure.dpi': 100})
# NOTE(review): `legend` receives a list here; confirm this is intended.
g = sns.catplot(data=dataframe, x=var, col="Attrition", col_wrap=4,
                kind="count", height=2.5, aspect=.8, palette='RdBu',
                legend=['yes', 'no'])
# Pass 1: raw count slightly above each bar, for the first two facets.
for facet_col in (0, 1):
    ax = g.facet_axis(0, facet_col)
    for p in ax.patches:
        bar_top = p.get_height()
        ax.text(p.get_x() + p.get_width() / 2, bar_top * 1.02, bar_top,
                color='black', rotation='horizontal', size='medium',
                ha='center')
subset_df = dataframe[dataframe[var] == 1]
soma = dataframe.shape[0]
# Pass 2: percentage of the whole subset, written inside each bar.
for facet_col in (0, 1):
    ax = g.facet_axis(0, facet_col)
    for p in ax.patches:
        share = "%.1f" % ((p.get_height() / soma) * 100) + " %"
        ax.text(p.get_x() + p.get_width() / 2, p.get_height() / 2.5, share,
                color='k', rotation='horizontal', size='medium', ha='center')
plt.show()
# MonthlyIncome of Sales Representatives, Attrition 'Yes' (left column) versus
# 'No' (right column): a boxplot above a 10-bin histogram, with the summary
# returned by `stats` printed under each histogram.
var = 'MonthlyIncome'
collumn = df[var]
plt.rcParams.update({'font.size': 12, 'figure.dpi': 100})
maxim = collumn.max()
minim = collumn.min()
dataframe = df.loc[df['JobRole'] == 'Sales Representative']


def _write_stats(panel, summary):
    # Print mean, skewness, Q1 and Q3 below `panel`, in its own axes coordinates.
    captions = (
        (0, -0.2, 'Mean = ' + str(summary[0])),
        (0.5, -0.2, 'Skewness = ' + str(summary[3])),
        (0, -0.25, 'Q1 = ' + str(summary[1])),
        (0.5, -0.25, 'Q3 = ' + str(summary[2])),
    )
    for x_pos, y_pos, caption in captions:
        panel.text(x_pos, y_pos, caption, transform=panel.transAxes,
                   fontsize=10, color='k')


fig, axes = plt.subplots(2, 2, figsize=(7, 6),
                         gridspec_kw={'height_ratios': [1, 4]})

# Left column: employees who left.
lista = dataframe[dataframe['Attrition'] == 'Yes']
g1 = sns.histplot(data=lista, x=var, bins=10, color="coral", ax=axes[1, 0])
g2 = sns.boxplot(data=lista, x=var, color="orange", ax=axes[0, 0])
g2.set(xlabel=None)
g2.set(title='Attrition = Yes')
res = stats(lista[var])
_write_stats(axes[1, 0], res)
plt.xlim(minim, maxim)
plt.xlabel(var)

# Right column: employees who stayed.
lista = dataframe[dataframe['Attrition'] == 'No']
g1 = sns.histplot(data=lista, x=var, bins=10, color="dodgerblue", ax=axes[1, 1])
g2 = sns.boxplot(data=lista, x=var, color="skyblue", ax=axes[0, 1])
g2.set(xlabel=None)
g2.set(title='Attrition = No')
res = stats(lista[var])
_write_stats(axes[1, 1], res)
axes[1, 1].set_ylabel('')

# Same fixed x range on all four panels.
for panel in axes.flat:
    panel.set_xlim(500, 7000)
plt.xlabel(var)
plt.show()
# Count of Sales Representatives per JobLevel, one facet per Attrition value,
# with the raw count written slightly above each bar.
var = 'JobLevel'
dataframe = df.loc[df['JobRole'] == 'Sales Representative']
plt.rcParams.update({'font.size': 10, 'figure.dpi': 100})
# NOTE(review): `legend` receives a list here; confirm this is intended.
g = sns.catplot(data=dataframe, x=var, col="Attrition", col_wrap=4,
                kind="count", height=2.5, aspect=.8, palette='inferno',
                legend=['yes', 'no'])
for facet_col in (0, 1):
    ax = g.facet_axis(0, facet_col)
    for p in ax.patches:
        bar_top = p.get_height()
        ax.text(p.get_x() + p.get_width() / 2, bar_top * 1.02, bar_top,
                color='black', rotation='horizontal', size='medium',
                ha='center')
# Computed but not used for any label in this cell.
subset_df = dataframe[dataframe[var] == 1]
soma = dataframe.shape[0]
plt.show()
# Count of Sales Representatives per StockOptionLevel, one facet per Attrition
# value, with the raw count written slightly above each bar.
var = 'StockOptionLevel'
dataframe = df.loc[df['JobRole'] == 'Sales Representative']
plt.rcParams.update({'font.size': 10, 'figure.dpi': 100})
# NOTE(review): `legend` receives a list here; confirm this is intended.
g = sns.catplot(data=dataframe, x=var, col="Attrition", col_wrap=4,
                kind="count", height=2.5, aspect=.8, palette='inferno',
                legend=['yes', 'no'])
for facet_col in (0, 1):
    ax = g.facet_axis(0, facet_col)
    for p in ax.patches:
        bar_top = p.get_height()
        ax.text(p.get_x() + p.get_width() / 2, bar_top * 1.02, bar_top,
                color='black', rotation='horizontal', size='medium',
                ha='center')
# Computed but not used for any label in this cell.
subset_df = dataframe[dataframe[var] == 1]
soma = dataframe.shape[0]
plt.show()
# Count of Sales Representatives per JobSatisfaction value, one facet per
# Attrition value, with the raw count written slightly above each bar.
# `variaveis` is (re)defined here but only `var` is plotted in this cell.
variaveis = ['EnvironmentSatisfaction', 'JobSatisfaction',
             'RelationshipSatisfaction', 'JobInvolvement', 'WorkLifeBalance']
var = 'JobSatisfaction'
dataframe = df.loc[df['JobRole'] == 'Sales Representative']
plt.rcParams.update({'font.size': 10, 'figure.dpi': 100})
# NOTE(review): `legend` receives a list here; confirm this is intended.
g = sns.catplot(data=dataframe, x=var, col="Attrition", col_wrap=4,
                kind="count", height=2.5, aspect=.8, palette='inferno',
                legend=['yes', 'no'])
for facet_col in (0, 1):
    ax = g.facet_axis(0, facet_col)
    for p in ax.patches:
        bar_top = p.get_height()
        ax.text(p.get_x() + p.get_width() / 2, bar_top * 1.02, bar_top,
                color='black', rotation='horizontal', size='medium',
                ha='center')
# Computed but not used for any label in this cell.
subset_df = dataframe[dataframe[var] == 1]
soma = dataframe.shape[0]
plt.show()
# Count of Sales Representatives per BusinessTravel category (horizontal
# bars), one facet per Attrition value, with the count written next to the
# end of each bar.
var = 'BusinessTravel'
dataframe = df.loc[df['JobRole'] == 'Sales Representative']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100
plt.rcParams['figure.figsize'] = 8, 6
g = sns.catplot(y=var, col="Attrition", col_wrap=4,
                data=dataframe,
                kind="count", height=2.5, aspect=.8, palette='RdBu',
                edgecolor='k')
for facet_col in (0, 1):
    ax = g.facet_axis(0, facet_col)
    for p in ax.patches:
        # Format the count as a whole number in BOTH facets. The previous
        # str(...).strip(".0") removed every leading/trailing '0' and '.',
        # turning e.g. "10.0" into "1", and was applied to one facet only.
        txt = '{:.0f}'.format(p.get_width())
        txt_x = p.get_width()
        txt_y = p.get_y()
        ax.text(txt_x + 3.5, txt_y + 0.6, txt,
                ha='center', va='bottom', fontsize='13')
# Computed but not used for any label in this cell.
subset_df = dataframe[dataframe[var] == 1]
soma = dataframe.shape[0]
plt.show()
# YearsInCurrentRole of Sales Representatives, Attrition 'Yes' (left column)
# versus 'No' (right column): a boxplot above a 10-bin histogram, with the
# summary returned by `stats` printed under each histogram.
# The x range comes from the min/max of the column over the whole `df`.
var = 'YearsInCurrentRole'
collumn = df[var]
plt.rcParams.update({'font.size': 10, 'figure.dpi': 100})
maxim = collumn.max()
minim = collumn.min()
dataframe = df.loc[df['JobRole'] == 'Sales Representative']


def _write_stats(panel, summary):
    # Print mean, skewness, Q1 and Q3 below `panel`, in its own axes coordinates.
    captions = (
        (0, -0.2, 'Mean = ' + str(summary[0])),
        (0.5, -0.2, 'Skewness = ' + str(summary[3])),
        (0, -0.25, 'Q1 = ' + str(summary[1])),
        (0.5, -0.25, 'Q3 = ' + str(summary[2])),
    )
    for x_pos, y_pos, caption in captions:
        panel.text(x_pos, y_pos, caption, transform=panel.transAxes,
                   fontsize=10, color='k')


fig, axes = plt.subplots(2, 2, figsize=(5, 6),
                         gridspec_kw={'height_ratios': [1, 4]})

# Left column: employees who left.
lista = dataframe[dataframe['Attrition'] == 'Yes']
g1 = sns.histplot(data=lista, x=var, bins=10, color="coral", ax=axes[1, 0])
g2 = sns.boxplot(data=lista, x=var, color="orange", ax=axes[0, 0])
g2.set(xlabel=None)
g2.set(title='Attrition = Yes')
res = stats(lista[var])
_write_stats(axes[1, 0], res)
plt.xlim(minim, maxim)
plt.xlabel(var)

# Right column: employees who stayed.
lista = dataframe[dataframe['Attrition'] == 'No']
g1 = sns.histplot(data=lista, x=var, bins=10, color="dodgerblue", ax=axes[1, 1])
g2 = sns.boxplot(data=lista, x=var, color="skyblue", ax=axes[0, 1])
g2.set(xlabel=None)
g2.set(title='Attrition = No')
res = stats(lista[var])
_write_stats(axes[1, 1], res)
axes[1, 1].set_ylabel('')

# Same x range on all four panels, with one unit of margin on the left.
for panel in axes.flat:
    panel.set_xlim(minim - 1, maxim)
plt.xlabel(var)
plt.show()
# MonthlyIncome of Laboratory Technicians, Attrition 'Yes' (left column)
# versus 'No' (right column): a boxplot above a 10-bin histogram, with the
# summary returned by `stats` printed under each histogram.
var = 'MonthlyIncome'
dataframe = df.loc[df['JobRole'] == 'Laboratory Technician']
collumn = dataframe[var]
plt.rcParams.update({'font.size': 12, 'figure.dpi': 100})
maxim = collumn.max()
minim = collumn.min()


def _write_stats(panel, summary):
    # Print mean, skewness, Q1 and Q3 below `panel`, in its own axes coordinates.
    captions = (
        (0, -0.2, 'Mean = ' + str(summary[0])),
        (0.5, -0.2, 'Skewness = ' + str(summary[3])),
        (0, -0.25, 'Q1 = ' + str(summary[1])),
        (0.5, -0.25, 'Q3 = ' + str(summary[2])),
    )
    for x_pos, y_pos, caption in captions:
        panel.text(x_pos, y_pos, caption, transform=panel.transAxes,
                   fontsize=10, color='k')


fig, axes = plt.subplots(2, 2, figsize=(7, 6),
                         gridspec_kw={'height_ratios': [1, 4]})

# Left column: employees who left.
lista = dataframe[dataframe['Attrition'] == 'Yes']
g1 = sns.histplot(data=lista, x=var, bins=10, color="coral", ax=axes[1, 0])
g2 = sns.boxplot(data=lista, x=var, color="orange", ax=axes[0, 0])
g2.set(xlabel=None)
g2.set(title='Attrition = Yes')
res = stats(lista[var])
_write_stats(axes[1, 0], res)
plt.xlim(minim, maxim)
plt.xlabel(var)

# Right column: employees who stayed.
lista = dataframe[dataframe['Attrition'] == 'No']
g1 = sns.histplot(data=lista, x=var, bins=10, color="dodgerblue", ax=axes[1, 1])
g2 = sns.boxplot(data=lista, x=var, color="skyblue", ax=axes[0, 1])
g2.set(xlabel=None)
g2.set(title='Attrition = No')
res = stats(lista[var])
_write_stats(axes[1, 1], res)
axes[1, 1].set_ylabel('')

# Same x range on all four panels: from 500 up to the subset maximum + 500.
for panel in axes.flat:
    panel.set_xlim(500, maxim + 500)
plt.xlabel(var)
plt.show()
# YearsInCurrentRole of Laboratory Technicians, Attrition 'Yes' (left column)
# versus 'No' (right column): a boxplot above a 10-bin histogram, with the
# summary returned by `stats` printed under each histogram.
var = 'YearsInCurrentRole'
dataframe = df.loc[df['JobRole'] == 'Laboratory Technician']
collumn = dataframe[var]
plt.rcParams.update({'font.size': 12, 'figure.dpi': 100})
maxim = collumn.max()
minim = collumn.min()


def _write_stats(panel, summary):
    # Print mean, skewness, Q1 and Q3 below `panel`, in its own axes coordinates.
    captions = (
        (0, -0.2, 'Mean = ' + str(summary[0])),
        (0.5, -0.2, 'Skewness = ' + str(summary[3])),
        (0, -0.25, 'Q1 = ' + str(summary[1])),
        (0.5, -0.25, 'Q3 = ' + str(summary[2])),
    )
    for x_pos, y_pos, caption in captions:
        panel.text(x_pos, y_pos, caption, transform=panel.transAxes,
                   fontsize=10, color='k')


fig, axes = plt.subplots(2, 2, figsize=(7, 6),
                         gridspec_kw={'height_ratios': [1, 4]})

# Left column: employees who left.
lista = dataframe[dataframe['Attrition'] == 'Yes']
g1 = sns.histplot(data=lista, x=var, bins=10, color="coral", ax=axes[1, 0])
g2 = sns.boxplot(data=lista, x=var, color="orange", ax=axes[0, 0])
g2.set(xlabel=None)
g2.set(title='Attrition = Yes')
res = stats(lista[var])
_write_stats(axes[1, 0], res)
plt.xlim(minim, maxim)
plt.xlabel(var)

# Right column: employees who stayed.
lista = dataframe[dataframe['Attrition'] == 'No']
g1 = sns.histplot(data=lista, x=var, bins=10, color="dodgerblue", ax=axes[1, 1])
g2 = sns.boxplot(data=lista, x=var, color="skyblue", ax=axes[0, 1])
g2.set(xlabel=None)
g2.set(title='Attrition = No')
res = stats(lista[var])
_write_stats(axes[1, 1], res)
axes[1, 1].set_ylabel('')

# Same x range on all four panels: subset minimum - 1 to subset maximum + 2.
for panel in axes.flat:
    panel.set_xlim(minim - 1, maxim + 2)
plt.xlabel(var)
plt.show()
# Laboratory Technician employees: distribution of YearsAtCompany,
# drawn side by side for Attrition = Yes (left) and Attrition = No (right).
var = 'YearsAtCompany'
dataframe = df.loc[df['JobRole'] == 'Laboratory Technician']
collumn = dataframe[var]
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
maxim = collumn.max()
minim = collumn.min()

# Top row holds the box plots, the taller bottom row the histograms.
lista = dataframe[dataframe['Attrition'] == 'Yes']
fig, axes = plt.subplots(2, 2, figsize=(7, 6),
                         gridspec_kw={'height_ratios': [1, 4]})

# --- Left column: Attrition = Yes ---
g1 = sns.histplot(ax=axes[1, 0], data=lista, x=var, color="coral", bins=10)
g2 = sns.boxplot(ax=axes[0, 0], data=lista, x=var, color="orange")
g2.set(xlabel=None, title='Attrition = Yes')
# stats() returns (mean, Q1, Q3, skewness) already formatted as strings.
res = stats(lista[var])
for x_pos, y_pos, caption in (
    (0, -0.2, 'Mean = ' + str(res[0])),
    (0.5, -0.2, 'Skewness = ' + str(res[3])),
    (0, -0.25, 'Q1 = ' + str(res[1])),
    (0.5, -0.25, 'Q3 = ' + str(res[2])),
):
    # Axes-fraction coordinates: negative y places the text below the panel.
    axes[1, 0].text(x_pos, y_pos, caption, transform=axes[1, 0].transAxes,
                    fontsize=10, color='k')
# pyplot-level calls: they apply to whichever axes is current here.
plt.xlim(minim, maxim)
plt.xlabel(var)

# --- Right column: Attrition = No ---
lista = dataframe[dataframe['Attrition'] == 'No']
g1 = sns.histplot(ax=axes[1, 1], data=lista, x=var, color="dodgerblue", bins=10)
g2 = sns.boxplot(ax=axes[0, 1], data=lista, x=var, color="skyblue")
g2.set(xlabel=None, title='Attrition = No')
res = stats(lista[var])
for x_pos, y_pos, caption in (
    (0, -0.2, 'Mean = ' + str(res[0])),
    (0.5, -0.2, 'Skewness = ' + str(res[3])),
    (0, -0.25, 'Q1 = ' + str(res[1])),
    (0.5, -0.25, 'Q3 = ' + str(res[2])),
):
    axes[1, 1].text(x_pos, y_pos, caption, transform=axes[1, 1].transAxes,
                    fontsize=10, color='k')
axes[1, 1].set_ylabel('')

# Give all four panels the same x-range so the two groups are comparable.
for panel in axes.flat:
    panel.set_xlim(minim - 1, maxim + 2)
plt.xlabel(var)
plt.show()
# Laboratory Technician employees: head-count per StockOptionLevel value,
# with one facet per Attrition value.
var = 'StockOptionLevel'
dataframe = df.loc[df['JobRole'] == 'Laboratory Technician']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100
g = sns.catplot(
    x=var,
    col="Attrition",
    col_wrap=4,
    data=dataframe,
    kind="count",
    height=2.5,
    aspect=.8,
    palette='inferno',
    legend=['yes', 'no'],
)

# Write each bar's height just above the bar, in both facets.
for facet_col in (0, 1):
    ax = g.facet_axis(0, facet_col)
    for p in ax.patches:
        bar_height = p.get_height()
        ax.text(
            p.get_x() + p.get_width() / 2,  # horizontal centre of the bar
            bar_height * 1.02,              # slightly above the bar top
            bar_height,
            color='black',
            rotation='horizontal',
            size='medium',
            ha='center',
        )

# Rows whose value equals 1, and the total row count of the role subset.
subset_df = dataframe[dataframe[var] == 1]
soma = dataframe.shape[0]
plt.show()
# Laboratory Technician employees: head-count per JobLevel value,
# with one facet per Attrition value.
var = 'JobLevel'
dataframe = df.loc[df['JobRole'] == 'Laboratory Technician']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100
g = sns.catplot(
    x=var,
    col="Attrition",
    col_wrap=4,
    data=dataframe,
    kind="count",
    height=2.5,
    aspect=.8,
    palette='inferno',
    legend=['yes', 'no'],
)

# Write each bar's height just above the bar, in both facets.
for facet_col in (0, 1):
    ax = g.facet_axis(0, facet_col)
    for p in ax.patches:
        bar_height = p.get_height()
        ax.text(
            p.get_x() + p.get_width() / 2,  # horizontal centre of the bar
            bar_height * 1.02,              # slightly above the bar top
            bar_height,
            color='black',
            rotation='horizontal',
            size='medium',
            ha='center',
        )

# Rows whose value equals 1, and the total row count of the role subset.
subset_df = dataframe[dataframe[var] == 1]
soma = dataframe.shape[0]
plt.show()
# Laboratory Technician employees: head-count per OverTime value,
# with one facet per Attrition value.
var = 'OverTime'
dataframe = df.loc[df['JobRole'] == 'Laboratory Technician']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100
g = sns.catplot(
    x=var,
    col="Attrition",
    col_wrap=4,
    data=dataframe,
    kind="count",
    height=2.5,
    aspect=.8,
    palette='RdBu',
    legend=['yes', 'no'],
)

# Write each bar's height just above the bar, in both facets.
for facet_col in (0, 1):
    ax = g.facet_axis(0, facet_col)
    for p in ax.patches:
        bar_height = p.get_height()
        ax.text(
            p.get_x() + p.get_width() / 2,  # horizontal centre of the bar
            bar_height * 1.02,              # slightly above the bar top
            bar_height,
            color='black',
            rotation='horizontal',
            size='medium',
            ha='center',
        )

# Rows whose value equals 1, and the total row count of the role subset.
subset_df = dataframe[dataframe[var] == 1]
soma = dataframe.shape[0]
plt.show()
# Laboratory Technician employees: head-count per WorkLifeBalance value,
# with one facet per Attrition value.
var = 'WorkLifeBalance'
dataframe = df.loc[df['JobRole'] == 'Laboratory Technician']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100
g = sns.catplot(
    x=var,
    col="Attrition",
    col_wrap=4,
    data=dataframe,
    kind="count",
    height=2.5,
    aspect=.8,
    palette='RdBu',
    legend=['yes', 'no'],
)

# Write each bar's height just above the bar, in both facets.
for facet_col in (0, 1):
    ax = g.facet_axis(0, facet_col)
    for p in ax.patches:
        bar_height = p.get_height()
        ax.text(
            p.get_x() + p.get_width() / 2,  # horizontal centre of the bar
            bar_height * 1.02,              # slightly above the bar top
            bar_height,
            color='black',
            rotation='horizontal',
            size='medium',
            ha='center',
        )

# Rows whose value equals 1, and the total row count of the role subset.
subset_df = dataframe[dataframe[var] == 1]
soma = dataframe.shape[0]
plt.show()
# Names of the satisfaction / involvement columns of the dataset.
variaveis = [
    'EnvironmentSatisfaction',
    'JobSatisfaction',
    'RelationshipSatisfaction',
    'JobInvolvement',
    'WorkLifeBalance',
]

# Sales Representative employees: head-count per JobInvolvement value,
# with one facet per Attrition value.
var = 'JobInvolvement'
dataframe = df.loc[df['JobRole'] == 'Sales Representative']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100
g = sns.catplot(
    x=var,
    col="Attrition",
    col_wrap=4,
    data=dataframe,
    kind="count",
    height=2.5,
    aspect=.8,
    palette='RdBu',
    legend=['yes', 'no'],
)

# Write each bar's height just above the bar, in both facets.
for facet_col in (0, 1):
    ax = g.facet_axis(0, facet_col)
    for p in ax.patches:
        bar_height = p.get_height()
        ax.text(
            p.get_x() + p.get_width() / 2,  # horizontal centre of the bar
            bar_height * 1.02,              # slightly above the bar top
            bar_height,
            color='black',
            rotation='horizontal',
            size='medium',
            ha='center',
        )

# Rows whose value equals 1, and the total row count of the role subset.
subset_df = dataframe[dataframe[var] == 1]
soma = dataframe.shape[0]
plt.show()
# Human Resources employees: distribution of MonthlyIncome,
# drawn side by side for Attrition = Yes (left) and Attrition = No (right).
var = 'MonthlyIncome'
dataframe = df.loc[df['JobRole'] == 'Human Resources']
collumn = dataframe[var]
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
maxim = collumn.max()
minim = collumn.min()

# Top row holds the box plots, the taller bottom row the histograms.
lista = dataframe[dataframe['Attrition'] == 'Yes']
fig, axes = plt.subplots(2, 2, figsize=(7, 6),
                         gridspec_kw={'height_ratios': [1, 4]})

# --- Left column: Attrition = Yes ---
g1 = sns.histplot(ax=axes[1, 0], data=lista, x=var, color="coral", bins=10)
g2 = sns.boxplot(ax=axes[0, 0], data=lista, x=var, color="orange")
g2.set(xlabel=None, title='Attrition = Yes')
# stats() returns (mean, Q1, Q3, skewness) already formatted as strings.
res = stats(lista[var])
for x_pos, y_pos, caption in (
    (0, -0.2, 'Mean = ' + str(res[0])),
    (0.5, -0.2, 'Skewness = ' + str(res[3])),
    (0, -0.25, 'Q1 = ' + str(res[1])),
    (0.5, -0.25, 'Q3 = ' + str(res[2])),
):
    # Axes-fraction coordinates: negative y places the text below the panel.
    axes[1, 0].text(x_pos, y_pos, caption, transform=axes[1, 0].transAxes,
                    fontsize=10, color='k')
# pyplot-level calls: they apply to whichever axes is current here.
plt.xlim(minim, maxim)
plt.xlabel(var)

# --- Right column: Attrition = No ---
lista = dataframe[dataframe['Attrition'] == 'No']
g1 = sns.histplot(ax=axes[1, 1], data=lista, x=var, color="dodgerblue", bins=10)
g2 = sns.boxplot(ax=axes[0, 1], data=lista, x=var, color="skyblue")
g2.set(xlabel=None, title='Attrition = No')
res = stats(lista[var])
for x_pos, y_pos, caption in (
    (0, -0.2, 'Mean = ' + str(res[0])),
    (0.5, -0.2, 'Skewness = ' + str(res[3])),
    (0, -0.25, 'Q1 = ' + str(res[1])),
    (0.5, -0.25, 'Q3 = ' + str(res[2])),
):
    axes[1, 1].text(x_pos, y_pos, caption, transform=axes[1, 1].transAxes,
                    fontsize=10, color='k')
axes[1, 1].set_ylabel('')

# Give all four panels the same x-range so the two groups are comparable.
for panel in axes.flat:
    panel.set_xlim(500, maxim + 500)
plt.xlabel(var)
plt.show()
# Human Resources employees: head-count per OverTime value,
# with one facet per Attrition value.
var = 'OverTime'
dataframe = df.loc[df['JobRole'] == 'Human Resources']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100
g = sns.catplot(
    x=var,
    col="Attrition",
    col_wrap=4,
    data=dataframe,
    kind="count",
    height=2.5,
    aspect=.8,
    palette='RdBu',
    legend=['yes', 'no'],
)

# Write each bar's height just above the bar, in both facets.
for facet_col in (0, 1):
    ax = g.facet_axis(0, facet_col)
    for p in ax.patches:
        bar_height = p.get_height()
        ax.text(
            p.get_x() + p.get_width() / 2,  # horizontal centre of the bar
            bar_height * 1.02,              # slightly above the bar top
            bar_height,
            color='black',
            rotation='horizontal',
            size='medium',
            ha='center',
        )

# Rows whose value equals 1, and the total row count of the role subset.
subset_df = dataframe[dataframe[var] == 1]
soma = dataframe.shape[0]
plt.show()
# Human Resources employees: distribution of TrainingTimesLastYear,
# drawn side by side for Attrition = Yes (left) and Attrition = No (right).
var = 'TrainingTimesLastYear'
dataframe = df.loc[df['JobRole'] == 'Human Resources']
collumn = dataframe[var]
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
maxim = collumn.max()
minim = collumn.min()

# Top row holds the box plots, the taller bottom row the histograms.
lista = dataframe[dataframe['Attrition'] == 'Yes']
fig, axes = plt.subplots(2, 2, figsize=(7, 6),
                         gridspec_kw={'height_ratios': [1, 4]})

# --- Left column: Attrition = Yes ---
g1 = sns.histplot(ax=axes[1, 0], data=lista, x=var, color="coral", bins=10)
g2 = sns.boxplot(ax=axes[0, 0], data=lista, x=var, color="orange")
g2.set(xlabel=None, title='Attrition = Yes')
# stats() returns (mean, Q1, Q3, skewness) already formatted as strings.
res = stats(lista[var])
for x_pos, y_pos, caption in (
    (0, -0.2, 'Mean = ' + str(res[0])),
    (0.5, -0.2, 'Skewness = ' + str(res[3])),
    (0, -0.25, 'Q1 = ' + str(res[1])),
    (0.5, -0.25, 'Q3 = ' + str(res[2])),
):
    # Axes-fraction coordinates: negative y places the text below the panel.
    axes[1, 0].text(x_pos, y_pos, caption, transform=axes[1, 0].transAxes,
                    fontsize=10, color='k')
# pyplot-level calls: they apply to whichever axes is current here.
plt.xlim(minim, maxim)
plt.xlabel(var)

# --- Right column: Attrition = No ---
lista = dataframe[dataframe['Attrition'] == 'No']
g1 = sns.histplot(ax=axes[1, 1], data=lista, x=var, color="dodgerblue", bins=10)
g2 = sns.boxplot(ax=axes[0, 1], data=lista, x=var, color="skyblue")
g2.set(xlabel=None, title='Attrition = No')
res = stats(lista[var])
for x_pos, y_pos, caption in (
    (0, -0.2, 'Mean = ' + str(res[0])),
    (0.5, -0.2, 'Skewness = ' + str(res[3])),
    (0, -0.25, 'Q1 = ' + str(res[1])),
    (0.5, -0.25, 'Q3 = ' + str(res[2])),
):
    axes[1, 1].text(x_pos, y_pos, caption, transform=axes[1, 1].transAxes,
                    fontsize=10, color='k')
axes[1, 1].set_ylabel('')

# Give all four panels the same x-range so the two groups are comparable.
for panel in axes.flat:
    panel.set_xlim(0, maxim + 3)
plt.xlabel(var)
plt.show()
# Human Resources employees: head-count per StockOptionLevel value,
# with one facet per Attrition value.
var = 'StockOptionLevel'
dataframe = df.loc[df['JobRole'] == 'Human Resources']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100
g = sns.catplot(x=var, col="Attrition", col_wrap=4,
                data=dataframe,
                kind="count", height=2.5, aspect=.8, palette='inferno',
                legend=['yes', 'no'])

# Label every bar with its count, in both facets.
for facet_col in (0, 1):
    ax = g.facet_axis(0, facet_col)
    for p in ax.patches:
        bar_height = p.get_height()
        # A bar without a finite height has no count to display.
        if math.isnan(bar_height):
            continue
        # Format as a whole number. The previous str(...).strip(".0") removed
        # every leading/trailing '.' and '0' character, so a count of 10.0
        # was rendered as "1" and 0.0 as an empty string.
        txt = str(int(round(bar_height)))
        ax.text(p.get_x() + p.get_width() / 2,  # horizontal centre of the bar
                bar_height * 1.02,              # slightly above the bar top
                txt,
                color='black',
                rotation='horizontal',
                size='medium', ha='center')

# Rows whose value equals 1, and the total row count of the role subset.
subset_df = dataframe[dataframe[var] == 1]
soma = dataframe.shape[0]
plt.show()
# Human Resources employees: head-count per JobLevel value,
# with one facet per Attrition value.
var = 'JobLevel'
dataframe = df.loc[df['JobRole'] == 'Human Resources']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100
g = sns.catplot(x=var, col="Attrition", col_wrap=4,
                data=dataframe,
                kind="count", height=2.5, aspect=.8, palette='inferno',
                legend=['yes', 'no'])

# Label every bar with its count, in both facets.
for facet_col in (0, 1):
    ax = g.facet_axis(0, facet_col)
    for p in ax.patches:
        bar_height = p.get_height()
        # A bar without a finite height has no count to display, and
        # int() would raise on NaN.
        if math.isnan(bar_height):
            continue
        # Use the same integer conversion in both facets. The second facet
        # previously called .astype('Int64') on the scalar height, which
        # fails for a plain Python float and uses a pandas-only dtype name.
        txt = int(round(bar_height))
        ax.text(p.get_x() + p.get_width() / 2,  # horizontal centre of the bar
                bar_height * 1.02,              # slightly above the bar top
                txt,
                color='black',
                rotation='horizontal',
                size='medium', ha='center')

# Rows whose value equals 1, and the total row count of the role subset.
subset_df = dataframe[dataframe[var] == 1]
soma = dataframe.shape[0]
plt.show()
# Human Resources employees: distribution of DistanceFromHome,
# drawn side by side for Attrition = Yes (left) and Attrition = No (right).
var = 'DistanceFromHome'
dataframe = df.loc[df['JobRole'] == 'Human Resources']
collumn = dataframe[var]
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
maxim = collumn.max()
minim = collumn.min()

# Top row holds the box plots, the taller bottom row the histograms.
lista = dataframe[dataframe['Attrition'] == 'Yes']
fig, axes = plt.subplots(2, 2, figsize=(7, 6),
                         gridspec_kw={'height_ratios': [1, 4]})

# --- Left column: Attrition = Yes ---
g1 = sns.histplot(ax=axes[1, 0], data=lista, x=var, color="coral", bins=10)
g2 = sns.boxplot(ax=axes[0, 0], data=lista, x=var, color="orange")
g2.set(xlabel=None, title='Attrition = Yes')
# stats() returns (mean, Q1, Q3, skewness) already formatted as strings.
res = stats(lista[var])
for x_pos, y_pos, caption in (
    (0, -0.2, 'Mean = ' + str(res[0])),
    (0.5, -0.2, 'Skewness = ' + str(res[3])),
    (0, -0.25, 'Q1 = ' + str(res[1])),
    (0.5, -0.25, 'Q3 = ' + str(res[2])),
):
    # Axes-fraction coordinates: negative y places the text below the panel.
    axes[1, 0].text(x_pos, y_pos, caption, transform=axes[1, 0].transAxes,
                    fontsize=10, color='k')
# pyplot-level calls: they apply to whichever axes is current here.
plt.xlim(minim, maxim)
plt.xlabel(var)

# --- Right column: Attrition = No ---
lista = dataframe[dataframe['Attrition'] == 'No']
g1 = sns.histplot(ax=axes[1, 1], data=lista, x=var, color="dodgerblue", bins=10)
g2 = sns.boxplot(ax=axes[0, 1], data=lista, x=var, color="skyblue")
g2.set(xlabel=None, title='Attrition = No')
res = stats(lista[var])
for x_pos, y_pos, caption in (
    (0, -0.2, 'Mean = ' + str(res[0])),
    (0.5, -0.2, 'Skewness = ' + str(res[3])),
    (0, -0.25, 'Q1 = ' + str(res[1])),
    (0.5, -0.25, 'Q3 = ' + str(res[2])),
):
    axes[1, 1].text(x_pos, y_pos, caption, transform=axes[1, 1].transAxes,
                    fontsize=10, color='k')
axes[1, 1].set_ylabel('')

# Give all four panels the same x-range so the two groups are comparable.
for panel in axes.flat:
    panel.set_xlim(0, maxim + 2)
plt.xlabel(var)
plt.show()
# Names of the satisfaction / involvement columns of the dataset.
variaveis = ['EnvironmentSatisfaction', 'JobSatisfaction',
             'RelationshipSatisfaction', 'JobInvolvement', 'WorkLifeBalance']

# Human Resources employees: head-count per EnvironmentSatisfaction value,
# with one facet per Attrition value.
var = 'EnvironmentSatisfaction'
dataframe = df.loc[df['JobRole'] == 'Human Resources']
plt.rcParams['font.size'] = 10
plt.rcParams['figure.dpi'] = 100
g = sns.catplot(x=var, col="Attrition", col_wrap=4,
                data=dataframe,
                kind="count", height=2.5, aspect=.8, palette='RdBu',
                legend=['yes', 'no'])

# Label every bar with its count, in both facets.
for facet_col in (0, 1):
    ax = g.facet_axis(0, facet_col)
    for p in ax.patches:
        bar_height = p.get_height()
        # A bar without a finite height has no count to display.
        if math.isnan(bar_height):
            continue
        # Both facets now show whole-number labels. Previously the first
        # facet printed the raw height while the second called
        # .astype('Int64') on the scalar, which fails for a plain Python
        # float and uses a pandas-only dtype name.
        ax.text(p.get_x() + p.get_width() / 2,  # horizontal centre of the bar
                bar_height * 1.02,              # slightly above the bar top
                int(round(bar_height)),
                color='black',
                rotation='horizontal',
                size='medium', ha='center')

# Rows whose value equals 1, and the total row count of the role subset.
subset_df = dataframe[dataframe[var] == 1]
soma = dataframe.shape[0]
plt.show()
# Human Resources employees: distribution of PercentSalaryHike,
# drawn side by side for Attrition = Yes (left) and Attrition = No (right).
var = 'PercentSalaryHike'
dataframe = df.loc[df['JobRole'] == 'Human Resources']
collumn = dataframe[var]
plt.rcParams['font.size'] = 12
plt.rcParams['figure.dpi'] = 100
maxim = collumn.max()
minim = collumn.min()

# Top row holds the box plots, the taller bottom row the histograms.
lista = dataframe[dataframe['Attrition'] == 'Yes']
fig, axes = plt.subplots(2, 2, figsize=(7, 6),
                         gridspec_kw={'height_ratios': [1, 4]})

# --- Left column: Attrition = Yes ---
g1 = sns.histplot(ax=axes[1, 0], data=lista, x=var, color="coral", bins=10)
g2 = sns.boxplot(ax=axes[0, 0], data=lista, x=var, color="orange")
g2.set(xlabel=None, title='Attrition = Yes')
# stats() returns (mean, Q1, Q3, skewness) already formatted as strings.
res = stats(lista[var])
for x_pos, y_pos, caption in (
    (0, -0.2, 'Mean = ' + str(res[0])),
    (0.5, -0.2, 'Skewness = ' + str(res[3])),
    (0, -0.25, 'Q1 = ' + str(res[1])),
    (0.5, -0.25, 'Q3 = ' + str(res[2])),
):
    # Axes-fraction coordinates: negative y places the text below the panel.
    axes[1, 0].text(x_pos, y_pos, caption, transform=axes[1, 0].transAxes,
                    fontsize=10, color='k')
# pyplot-level calls: they apply to whichever axes is current here.
plt.xlim(minim, maxim)
plt.xlabel(var)

# --- Right column: Attrition = No ---
lista = dataframe[dataframe['Attrition'] == 'No']
g1 = sns.histplot(ax=axes[1, 1], data=lista, x=var, color="dodgerblue", bins=10)
g2 = sns.boxplot(ax=axes[0, 1], data=lista, x=var, color="skyblue")
g2.set(xlabel=None, title='Attrition = No')
res = stats(lista[var])
for x_pos, y_pos, caption in (
    (0, -0.2, 'Mean = ' + str(res[0])),
    (0.5, -0.2, 'Skewness = ' + str(res[3])),
    (0, -0.25, 'Q1 = ' + str(res[1])),
    (0.5, -0.25, 'Q3 = ' + str(res[2])),
):
    axes[1, 1].text(x_pos, y_pos, caption, transform=axes[1, 1].transAxes,
                    fontsize=10, color='k')
axes[1, 1].set_ylabel('')

# Give all four panels the same x-range so the two groups are comparable.
for panel in axes.flat:
    panel.set_xlim(0, maxim + 2)
plt.xlabel(var)
plt.show()